#### Summary statistics
## This file takes the "IGM_Dataset" file from 0_CleanData.R and merges in the 
# landing information 

# OUTPUT: this file creates the "cohort.csv" file 


library(haven)
library(readr)
library(data.table)
library(scales)
library(lfe)
library(stargazer)

sample <- fread("H:/Zheng_10223/Joint/IGM_dataset_2025.csv")
length(unique(sample$IMDB_ID))
cohort <- sample[!is.na(sample$MainParent_Income_HH_MainParentAge45_49) & !is.na(sample$Child_Income_IND_30_34_pct),]

length(unique(cohort$IMDB_ID))
child_landing <- read_dta("H:/Zheng_10223/Joint/child_pnrf_1980_2020_f3_v1.dta")
child_landing <- data.table(child_landing)
child_landing$PARENT3=child_landing$Parent3; child_landing$Parent3=NULL
child_landing$PARENT4=child_landing$Parent4; child_landing$Parent4=NULL

cohort <- merge(x=cohort, y=child_landing[,c("IMDB_ID", "PARENT3", "PARENT4")], by.x="IMDB_ID", by.y="IMDB_ID")


landing <- read_dta("G:/IMDB_AllYears/rdc/IMDB_BDIM_2022_v1/data_donnees/stata/Core_IMDB/pnrf_1980_2022_f3_v1.dta")
landing <- data.table(landing)


# replace years of schooling 99 value not stated with 10 (so it goes in the less than 12 schooling bin)
landing$YEARS_OF_SCHOOLING[landing$YEARS_OF_SCHOOLING==99]=10

# Country of residence, birth, citizenship
country_crosswalk <- read_csv("H:/Zheng_10223/Joint/countryfiles/country_crosswalk.csv")
landing$COUNTRY_RESIDENCE <- as.numeric(landing$COUNTRY_RESIDENCE)
landing <- merge(x=landing, y=country_crosswalk, by.x="COUNTRY_RESIDENCE", by.y="Code")
setnames(landing, old=c("Country"), new=c("CountryOfLastResidence"))

cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "WORLD_AREA_BIRTH", "CountryOfLastResidence", "DESTINATION_CMA")], by.x="IMDB_ID", by.y="IMDB_ID")
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "YEARS_OF_SCHOOLING","EDUCATION_QUALIFICATION" ,"LANDING_YEAR")], by.x="PARENT1", by.y="IMDB_ID", all.x=T)
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "YEARS_OF_SCHOOLING","EDUCATION_QUALIFICATION",  "LANDING_YEAR")], by.x="PARENT2", by.y="IMDB_ID", all.x=T)
setnames(cohort, old=c("YEARS_OF_SCHOOLING.x", "YEARS_OF_SCHOOLING.y",  "LANDING_YEAR.x", "LANDING_YEAR.y","EDUCATION_QUALIFICATION.x","EDUCATION_QUALIFICATION.y"), new=c("YEARS_OF_SCHOOLING_PARENT1", "YEARS_OF_SCHOOLING_PARENT2", "Landing_Year1", "Landing_Year2","EDUCATION_QUALIFICATION1","EDUCATION_QUALIFICATION2"))
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "YEARS_OF_SCHOOLING","EDUCATION_QUALIFICATION")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T)
setnames(cohort, old=c("YEARS_OF_SCHOOLING","EDUCATION_QUALIFICATION"), new=c("YEARS_OF_SCHOOLING_MainParent","EDUCATION_QUALIFICATION_Main"))


### Categorize immigration class using main parent, parent 1, 2 and child
# Note that even using up to four individuals, some children are always spouse/dependent
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "IMMIGRATION_CATEGORY_CENSUS")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T)
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "IMMIGRATION_CATEGORY_CENSUS")], by.x="PARENT1", by.y="IMDB_ID", all.x=T)
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "IMMIGRATION_CATEGORY_CENSUS")], by.x="PARENT2", by.y="IMDB_ID", all.x=T)
setnames(cohort, old=c("IMMIGRATION_CATEGORY_CENSUS.x", "IMMIGRATION_CATEGORY_CENSUS.y", "IMMIGRATION_CATEGORY_CENSUS"), new=c("Immigrantcategory_main", "Immigrantcategory1", "Immigrantcategory2"))

cohort <- merge(x=cohort, y=child_landing[,c("IMDB_ID", "IMMIGRATION_CATEGORY_CENSUS")], by.x="IMDB_ID", by.y="IMDB_ID")
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "IMMIGRATION_CATEGORY_CENSUS")], by.x="PARENT3", by.y="IMDB_ID", all.x=T)
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "IMMIGRATION_CATEGORY_CENSUS")], by.x="PARENT4", by.y="IMDB_ID", all.x=T)
setnames(cohort, old=c("IMMIGRATION_CATEGORY_CENSUS.x", "IMMIGRATION_CATEGORY_CENSUS.y", "IMMIGRATION_CATEGORY_CENSUS"), new=c("Immigrantcategory_child", "Immigrantcategory3", "Immigrantcategory4"))

# marital status for the child 
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "FAMILY_STATUS", "FAMILY_STATUS_ROLLUP", "MARITAL_STATUS")], by.x="IMDB_ID", by.y="IMDB_ID", all.x=T)

cohort <- cohort[!FAMILY_STATUS %in% c(2,5) & MARITAL_STATUS %in% c(3,9)]



# last tax years and death for the child:   can do better with emigrant status
#cohort <- merge (x=cohort, y=landing[,c("IMDB_ID",)])



# marital status for the parent 
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID",  "MARITAL_STATUS","MARITAL_STATUS_ROLLUP")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T)
setnames(cohort, old=c("MARITAL_STATUS.x","MARITAL_STATUS.y","MARITAL_STATUS_ROLLUP"), new=c("Marital_status_child","Marital_status_main","MARITAL_STATUS_ROLLUP_main"))




cohort$ImmigrationCategory <- NA
cohort$ImmigrationCategory[which(cohort$Immigrantcategory_main %in% c('A1111', 'A1115', 'A1120', 'A1130', 'A1141', 'A1142', 'A1150', 'A1160', 'A1231', 'A1235', 'A1300')|
                                 cohort$Immigrantcategory1 %in% c('A1111', 'A1115', 'A1120', 'A1130', 'A1141', 'A1142', 'A1150', 'A1160', 'A1231', 'A1235', 'A1300')|
                                 cohort$Immigrantcategory2 %in% c('A1111', 'A1115', 'A1120', 'A1130', 'A1141', 'A1142', 'A1150', 'A1160', 'A1231', 'A1235', 'A1300')|
                                 cohort$Immigrantcategory_child %in% c('A1111', 'A1115', 'A1120', 'A1130', 'A1141', 'A1142', 'A1150', 'A1160', 'A1231', 'A1235', 'A1300'))] <- "Economic"

cohort$ImmigrationCategory[which(cohort$Immigrantcategory_main %in% c('A1211', 'A1212', 'A1215', 'A1221', 'A1225')|
                                 cohort$Immigrantcategory1 %in% c('A1211', 'A1212', 'A1215', 'A1221', 'A1225')|
                                 cohort$Immigrantcategory2 %in% c('A1211', 'A1212', 'A1215', 'A1221', 'A1225')|
                                 cohort$Immigrantcategory_child %in% c('A1211', 'A1212', 'A1215', 'A1221', 'A1225'))] <- "Business"

cohort$ImmigrationCategory[which(cohort$Immigrantcategory_main %in% c('C3110', 'C3120', 'C3210', 'C3220', 'C3230', 'D4110', 'D4120', 'D4210', 'Z9991')|
                                 cohort$Immigrantcategory1 %in% c('C3110', 'C3120', 'C3210', 'C3220', 'C3230', 'D4110', 'D4120', 'D4210', 'Z9991')|
                                 cohort$Immigrantcategory2 %in% c('C3110', 'C3120', 'C3210', 'C3220', 'C3230', 'D4110', 'D4120', 'D4210', 'Z9991')|
                                 cohort$Immigrantcategory_child %in% c('C3110', 'C3120', 'C3210', 'C3220', 'C3230', 'D4110', 'D4120', 'D4210', 'Z9991'))]<- "Refugee"

cohort$ImmigrationCategory[which(is.na(cohort$ImmigrationCategory))] <- "Missing"

#cohort <- merge(x=cohort, y=landing[,c("IMDB_ID",)], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T)
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "gender")], by.x="IMDB_ID", by.y="IMDB_ID", all.x=T)
setnames(cohort, old=c("gender"), new=c("Gender_Child"))

cohort$FemaleChild <- (cohort$Gender_Child==2)
cohort$FemaleChild[which(!cohort$Gender_Child  %in% c(1,2))] <- NA
cohort$Status_Economic <- (cohort$ImmigrationCategory=="Economic")
cohort$Status_Business <- (cohort$ImmigrationCategory=="Business")
cohort$Status_Refugee <- (cohort$ImmigrationCategory=="Refugee")
cohort$Status_Unknown <- (cohort$ImmigrationCategory=="Missing")

cohort$WORLD_AREA_BIRTH <- as.character(cohort$WORLD_AREA_BIRTH)
cohort$WORLD_AREA_BIRTH[which(cohort$WORLD_AREA_BIRTH=="1")] <- "Europe"; cohort$Europe <- cohort$WORLD_AREA_BIRTH=="Europe"
cohort$WORLD_AREA_BIRTH[which(cohort$WORLD_AREA_BIRTH=="2")] <- "Africa and Middle East"; cohort$AfricaMiddleEast <- cohort$WORLD_AREA_BIRTH=="Africa and Middle East"
cohort$WORLD_AREA_BIRTH[which(cohort$WORLD_AREA_BIRTH=="3")] <- "Southern Asia"; cohort$SouthernAsia <- cohort$WORLD_AREA_BIRTH=="Southern Asia"
cohort$WORLD_AREA_BIRTH[which(cohort$WORLD_AREA_BIRTH=="4")] <- "Eastern Asia"; cohort$EasternAsia <- cohort$WORLD_AREA_BIRTH=="Eastern Asia"
cohort$WORLD_AREA_BIRTH[which(cohort$WORLD_AREA_BIRTH=="5")] <- "Oceania and other Asia"; cohort$Oceania <- cohort$WORLD_AREA_BIRTH=="Oceania and other Asia"
cohort$WORLD_AREA_BIRTH[which(cohort$WORLD_AREA_BIRTH=="6")] <- "South and Central America"; cohort$CentralSouthAmerica <- cohort$WORLD_AREA_BIRTH=="South and Central America"
cohort$WORLD_AREA_BIRTH[which(cohort$WORLD_AREA_BIRTH=="7")] <- "US or other"; cohort$USOrOther <- cohort$WORLD_AREA_BIRTH=="US or other"

cohort$DESTINATION_CMA <- as.character(cohort$DESTINATION_CMA)
cohort$DESTINATION_CMA[which(cohort$DESTINATION_CMA=="35535")] <- "Toronto"; cohort$Toronto <- cohort$DESTINATION_CMA=="Toronto"
cohort$DESTINATION_CMA[which(cohort$DESTINATION_CMA=="24462")] <- "Montreal"; cohort$Montreal <- cohort$DESTINATION_CMA=="Montreal"
cohort$DESTINATION_CMA[which(cohort$DESTINATION_CMA=="59933")] <- "Vancouver"; cohort$Vancouver <- cohort$DESTINATION_CMA=="Vancouver"
cohort$DESTINATION_CMA[which(cohort$DESTINATION_CMA=="48825")] <- "Calgary"; cohort$Calgary <- cohort$DESTINATION_CMA=="Calgary"
cohort$DESTINATION_CMA[which(cohort$DESTINATION_CMA=="48835")] <- "Edmonton"; cohort$Edmonton <- cohort$DESTINATION_CMA=="Edmonton"


# Max year of schooling among two parents
cohort$YEARS_OF_SCHOOLING_PARENT1[which(cohort$YEARS_OF_SCHOOLING_PARENT1==99)] <- NA; cohort$YEARS_OF_SCHOOLING_PARENT1[which(cohort$YEARS_OF_SCHOOLING_PARENT1<0 | cohort$YEARS_OF_SCHOOLING_PARENT1>23)] <- NA; 
cohort$YEARS_OF_SCHOOLING_PARENT2[which(cohort$YEARS_OF_SCHOOLING_PARENT2==99)] <- NA; cohort$YEARS_OF_SCHOOLING_PARENT2[which(cohort$YEARS_OF_SCHOOLING_PARENT2<0 | cohort$YEARS_OF_SCHOOLING_PARENT2>23)] <- NA; 
cohort$YEARS_OF_SCHOOLING_PARENT_MAX=pmax(cohort$YEARS_OF_SCHOOLING_PARENT1, cohort$YEARS_OF_SCHOOLING_PARENT2, na.rm=T)
cohort$SchoolingBins_max <- NA
cohort$SchoolingBins_max[which(cohort$YEARS_OF_SCHOOLING_PARENT_MAX<12)] <- "<12"
cohort$SchoolingBins_max[which(cohort$YEARS_OF_SCHOOLING_PARENT_MAX==12)] <- "12"
cohort$SchoolingBins_max[which(cohort$YEARS_OF_SCHOOLING_PARENT_MAX>12 & cohort$YEARS_OF_SCHOOLING_PARENT_MAX<=15)] <- "12-15"
cohort$SchoolingBins_max[which(cohort$YEARS_OF_SCHOOLING_PARENT_MAX==16)] <- "16"
cohort$SchoolingBins_max[which(cohort$YEARS_OF_SCHOOLING_PARENT_MAX>16 & cohort$YEARS_OF_SCHOOLING_PARENT_MAX<99)] <- ">16"
cohort$LTHS <- (cohort$SchoolingBins=="<12")
cohort$HSGrad <- (cohort$SchoolingBins=="12")
cohort$SomeCollege <- (cohort$SchoolingBins=="12-15")
cohort$College <- (cohort$SchoolingBins=="16")
cohort$CollegePlus <- (cohort$SchoolingBins==">16")

# English ability of main parent
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "ENGLISH_IND_PRIOR2019", "ENGLISH_IND", "FRENCH_IND_PRIOR2019", "FRENCH_IND", "OFFICIAL_LANGUAGE")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T)

setnames(cohort, old=c("ENGLISH_IND_PRIOR2019", "ENGLISH_IND", "FRENCH_IND_PRIOR2019", "FRENCH_IND", "OFFICIAL_LANGUAGE"), new=c("ENGLISH_IND_PRIOR2019_Main", "ENGLISH_IND_Main", "FRENCH_IND_PRIOR2019_Main", "FRENCH_IND_Main", "OFFICIAL_LANGUAGE_Main"))


cohort$EnglishMotherTongue_Main <- (cohort$ENGLISH_IND_Main==1|cohort$ENGLISH_IND_PRIOR2019_Main==1)
cohort$AnyEnglish_Main <- (cohort$OFFICIAL_LANGUAGE_Main %in% c(1,3) | cohort$ENGLISH_IND_Main==1|cohort$ENGLISH_IND_PRIOR2019_Main==1)
cohort$FrenchMotherTongue_Main <- (cohort$FRENCH_IND_Main==1|cohort$FRENCH_IND_PRIOR2019_Main==1)
cohort$AnyFrench_Main <- (cohort$OFFICIAL_LANGUAGE_Main %in% c(2,3) | cohort$FRENCH_IND_Main==1|cohort$FRENCH_IND_PRIOR2019_Main==1)


# Language for Parent 1

cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "ENGLISH_IND_PRIOR2019", "ENGLISH_IND", "FRENCH_IND_PRIOR2019", "FRENCH_IND", "OFFICIAL_LANGUAGE")], by.x="PARENT1", by.y="IMDB_ID", all.x=T)
setnames(cohort, old=c("ENGLISH_IND_PRIOR2019", "ENGLISH_IND", "FRENCH_IND_PRIOR2019", "FRENCH_IND", "OFFICIAL_LANGUAGE"), new=c("ENGLISH_IND_PRIOR2019_Parent1", "ENGLISH_IND_Parent1", "FRENCH_IND_PRIOR2019_Parent1", "FRENCH_IND_Parent1", "OFFICIAL_LANGUAGE_Parent1"))



cohort$EnglishMotherTongue_Parent1 <- (cohort$ENGLISH_IND_Parent1==1|cohort$ENGLISH_IND_PRIOR2019_Parent1==1)
cohort$AnyEnglish_Parent1 <- (cohort$OFFICIAL_LANGUAGE_Parent1 %in% c(1,3) | cohort$ENGLISH_IND_Parent1==1|cohort$ENGLISH_IND_PRIOR2019_Parent1==1)
cohort$FrenchMotherTongue_Parent1 <- (cohort$FRENCH_IND_Parent1==1|cohort$FRENCH_IND_PRIOR2019_Parent1==1)
cohort$AnyFrench_Parent1 <- (cohort$OFFICIAL_LANGUAGE_Parent1 %in% c(2,3) | cohort$FRENCH_IND_Parent1==1|cohort$FRENCH_IND_PRIOR2019_Parent1==1)


# Language for Parent 2 



cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "ENGLISH_IND_PRIOR2019", "ENGLISH_IND", "FRENCH_IND_PRIOR2019", "FRENCH_IND", "OFFICIAL_LANGUAGE")], by.x="PARENT2", by.y="IMDB_ID", all.x=T)
setnames(cohort, old=c("ENGLISH_IND_PRIOR2019", "ENGLISH_IND", "FRENCH_IND_PRIOR2019", "FRENCH_IND", "OFFICIAL_LANGUAGE"), new=c("ENGLISH_IND_PRIOR2019_Parent2", "ENGLISH_IND_Parent2", "FRENCH_IND_PRIOR2019_Parent2", "FRENCH_IND_Parent2", "OFFICIAL_LANGUAGE_Parent2"))



cohort$EnglishMotherTongue_Parent2 <- (cohort$ENGLISH_IND_Parent2==1|cohort$ENGLISH_IND_PRIOR2019_Parent2==1)
cohort$AnyEnglish_Parent2 <- (cohort$OFFICIAL_LANGUAGE_Parent2 %in% c(1,3) | cohort$ENGLISH_IND_Parent2==1|cohort$ENGLISH_IND_PRIOR2019_Parent2==1)
cohort$FrenchMotherTongue_Parent2 <- (cohort$FRENCH_IND_Parent2==1|cohort$FRENCH_IND_PRIOR2019_Parent2==1)
cohort$AnyFrench_Parent2 <- (cohort$OFFICIAL_LANGUAGE_Parent2 %in% c(2,3) | cohort$FRENCH_IND_Parent2==1|cohort$FRENCH_IND_PRIOR2019_Parent2==1)



# INTENDED OCCUPATION AND SKILL FOR mAIN 
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "NOC2_CD11","NOC3_CD11","SKILL_LEVEL_CD11")], by.x="MAIN_PARENT", by.y="IMDB_ID", all.x=T)
setnames(cohort,old=c("NOC2_CD11","NOC3_CD11","SKILL_LEVEL_CD11"), new=c("NOC2_CD11_Main","NOC3_CD11_Main","SKILL_LEVEL_CD11_Main"))





# INTENDED OCCUPATION AND SKILL FOR Parent 1 
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "NOC2_CD11","NOC3_CD11","SKILL_LEVEL_CD11")], by.x="PARENT1", by.y="IMDB_ID", all.x=T)
setnames(cohort,old=c("NOC2_CD11","NOC3_CD11","SKILL_LEVEL_CD11"), new=c("NOC2_CD11_Parent1","NOC3_CD11_Parent1","SKILL_LEVEL_CD11_Parent1"))

# INTENDED OCCUPATION AND SKILL FOR Parent 2 
cohort <- merge(x=cohort, y=landing[,c("IMDB_ID", "NOC2_CD11","NOC3_CD11","SKILL_LEVEL_CD11")], by.x="PARENT2", by.y="IMDB_ID", all.x=T)
setnames(cohort,old=c("NOC2_CD11","NOC3_CD11","SKILL_LEVEL_CD11"), new=c("NOC2_CD11_Parent2","NOC3_CD11_Parent2","SKILL_LEVEL_CD11_Parent2"))



######## Language, Schooling, Skill for Father and Top Parent: 

# Language 
cohort$AnyEnglish_Father=NA
cohort$AnyEnglish_Father[cohort$Father=="Parent1"]=cohort$AnyEnglish_Parent1[cohort$Father=="Parent1"]
cohort$AnyEnglish_Father[cohort$Father=="Parent2"]=cohort$AnyEnglish_Parent2[cohort$Father=="Parent2"]
cohort$AnyEnglish_Father[cohort$Father=="MAIN_PARENT"]=cohort$AnyEnglish_Main[cohort$Father=="MAIN_PARENT"]

cohort$AnyFrench_Father=NA
cohort$AnyFrench_Father[cohort$Father=="Parent1"]=cohort$AnyFrench_Parent1[cohort$Father=="Parent1"]
cohort$AnyFrench_Father[cohort$Father=="Parent2"]=cohort$AnyFrench_Parent2[cohort$Father=="Parent2"]
cohort$AnyFrench_Father[cohort$Father=="MAIN_PARENT"]=cohort$AnyFrench_Main[cohort$Father=="MAIN_PARENT"]

cohort$AnyEnglish_Top=NA
cohort$AnyFrench_Top=NA
cohort$AnyEnglish_Top[cohort$TopParent=="Parent1"]=cohort$AnyEnglish_Parent1[cohort$TopParent=="Parent1"]
cohort$AnyEnglish_Top[cohort$TopParent=="Parent2"]=cohort$AnyEnglish_Parent2[cohort$TopParent=="Parent2"]
cohort$AnyEnglish_Top[cohort$TopParent=="MAIN_PARENT"]=cohort$AnyEnglish_Main[cohort$TopParent=="MAIN_PARENT"]

cohort$AnyFrench_Top[cohort$TopParent=="Parent1"]=cohort$AnyFrench_Parent1[cohort$TopParent=="Parent1"]
cohort$AnyFrench_Top[cohort$TopParent=="Parent2"]=cohort$AnyFrench_Parent2[cohort$TopParent=="Parent2"]
cohort$AnyFrench_Top[cohort$TopParent=="MAIN_PARENT"]=cohort$AnyFrench_Main[cohort$TopParent=="MAIN_PARENT"]



# Schooling 
cohort$SchoolingBins_Main <- NA
cohort$SchoolingBins_Main[which(cohort$YEARS_OF_SCHOOLING_MainParent<12)] <- "<12"
cohort$SchoolingBins_Main[which(cohort$YEARS_OF_SCHOOLING_MainParent==12)] <- "12"
cohort$SchoolingBins_Main[which(cohort$YEARS_OF_SCHOOLING_MainParent>12 & cohort$YEARS_OF_SCHOOLING_MainParent<=15)] <- "12-15"
cohort$SchoolingBins_Main[which(cohort$YEARS_OF_SCHOOLING_MainParent==16)] <- "16"
cohort$SchoolingBins_Main[which(cohort$YEARS_OF_SCHOOLING_MainParent>16 & cohort$YEARS_OF_SCHOOLING_MainParent<99)] <- ">16"

cohort$schooling_Father=NA
cohort$schooling_Father[cohort$Father=="Parent1"]=cohort$YEARS_OF_SCHOOLING_PARENT1[cohort$Father=="Parent1"]
cohort$schooling_Father[cohort$Father=="Parent2"]=cohort$YEARS_OF_SCHOOLING_PARENT2[cohort$Father=="Parent2"]
cohort$schooling_Father[cohort$Father=="MAIN_PARENT"]=cohort$YEARS_OF_SCHOOLING_MainParent[cohort$Father=="MAIN_PARENT"]

cohort$educqual_Father=NA
cohort$educqual_Father[cohort$Father=="Parent1"]=cohort$EDUCATION_QUALIFICATION1[cohort$Father=="Parent1"]
cohort$educqual_Father[cohort$Father=="Parent2"]=cohort$EDUCATION_QUALIFICATION2[cohort$Father=="Parent2"]
cohort$educqual_Father[cohort$Father=="MAIN_PARENT"]=cohort$EDUCATION_QUALIFICATION_Main[cohort$Father=="MAIN_PARENT"]

cohort$educqual_Top=NA
cohort$educqual_Top[cohort$TopParent=="Parent1"]=cohort$EDUCATION_QUALIFICATION1[cohort$TopParent=="Parent1"]
cohort$educqual_Top[cohort$TopParent=="Parent2"]=cohort$EDUCATION_QUALIFICATION2[cohort$TopParent=="Parent2"]
cohort$educqual_Top[cohort$TopParent=="MAIN_PARENT"]=cohort$EDUCATION_QUALIFICATION_Main[cohort$TopParent=="MAIN_PARENT"]

cohort$SchoolingBins_Father <- NA
cohort$SchoolingBins_Father[which(cohort$schooling_Father<12)] <- "<12"
cohort$SchoolingBins_Father[which(cohort$schooling_Father==12)] <- "12"
cohort$SchoolingBins_Father[which(cohort$schooling_Father>12 & cohort$schooling_Father<=15)] <- "12-15"
cohort$SchoolingBins_Father[which(cohort$schooling_Father==16)] <- "16"
cohort$SchoolingBins_Father[which(cohort$schooling_Father>16 & cohort$schooling_Father<99)] <- ">16"

cohort$schooling_Top=NA
cohort$schooling_Top[cohort$TopParent=="Parent1"]=cohort$YEARS_OF_SCHOOLING_PARENT1[cohort$TopParent=="Parent1"]
cohort$schooling_Top[cohort$TopParent=="Parent2"]=cohort$YEARS_OF_SCHOOLING_PARENT2[cohort$TopParent=="Parent2"]
cohort$schooling_Top[cohort$TopParent=="MAIN_PARENT"]=cohort$YEARS_OF_SCHOOLING_MainParent[cohort$TopParent=="MAIN_PARENT"]

cohort$SchoolingBins_Top <- NA
cohort$SchoolingBins_Top[which(cohort$schooling_Top<12)] <- "<12"
cohort$SchoolingBins_Top[which(cohort$schooling_Top==12)] <- "12"
cohort$SchoolingBins_Top[which(cohort$schooling_Top>12 & cohort$schooling_Top<=15)] <- "12-15"
cohort$SchoolingBins_Top[which(cohort$schooling_Top==16)] <- "16"
cohort$SchoolingBins_Top[which(cohort$schooling_Top>16 & cohort$schooling_Top<99)] <- ">16"


# Skill 
cohort$SKILL_LEVEL_CD11_Father= NA
cohort$SKILL_LEVEL_CD11_Father[cohort$Father=="Parent1"]=cohort$SKILL_LEVEL_CD11_Parent1[cohort$Father=="Parent1"]
cohort$SKILL_LEVEL_CD11_Father[cohort$Father=="Parent2"]=cohort$SKILL_LEVEL_CD11_Parent2[cohort$Father=="Parent2"]
cohort$SKILL_LEVEL_CD11_Father[cohort$Father=="MAIN_PARENT"]=cohort$SKILL_LEVEL_CD11_Main[cohort$Father=="MAIN_PARENT"]

cohort$NOC2_CD11_Father= NA
cohort$NOC2_CD11_Father[cohort$Father=="Parent1"]=cohort$NOC2_CD11_Parent1[cohort$Father=="Parent1"]
cohort$NOC2_CD11_Father[cohort$Father=="Parent2"]=cohort$NOC2_CD11_Parent2[cohort$Father=="Parent2"]
cohort$NOC2_CD11_Father[cohort$Father=="MAIN_PARENT"]=cohort$NOC2_CD11_Main[cohort$Father=="MAIN_PARENT"]

cohort$NOC3_CD11_Father= NA
cohort$NOC3_CD11_Father[cohort$Father=="Parent1"]=cohort$NOC3_CD11_Parent1[cohort$Father=="Parent1"]
cohort$NOC3_CD11_Father[cohort$Father=="Parent2"]=cohort$NOC3_CD11_Parent2[cohort$Father=="Parent2"]
cohort$NOC3_CD11_Father[cohort$Father=="MAIN_PARENT"]=cohort$NOC3_CD11_Main[cohort$Father=="MAIN_PARENT"]


cohort$SKILL_LEVEL_CD11_Top= NA
cohort$SKILL_LEVEL_CD11_Top[cohort$TopParent=="Parent1"]=cohort$SKILL_LEVEL_CD11_Parent1[cohort$TopParent=="Parent1"]
cohort$SKILL_LEVEL_CD11_Top[cohort$TopParent=="Parent2"]=cohort$SKILL_LEVEL_CD11_Parent2[cohort$TopParent=="Parent2"]
cohort$SKILL_LEVEL_CD11_Top[cohort$TopParent=="MAIN_PARENT"]=cohort$SKILL_LEVEL_CD11_Main[cohort$TopParent=="MAIN_PARENT"]

cohort$NOC2_CD11_Top= NA
cohort$NOC2_CD11_Top[cohort$TopParent=="Parent1"]=cohort$NOC2_CD11_Parent1[cohort$TopParent=="Parent1"]
cohort$NOC2_CD11_Top[cohort$TopParent=="Parent2"]=cohort$NOC2_CD11_Parent2[cohort$TopParent=="Parent2"]
cohort$NOC2_CD11_Top[cohort$TopParent=="MAIN_PARENT"]=cohort$NOC2_CD11_Main[cohort$TopParent=="MAIN_PARENT"]

cohort$NOC3_CD11_Top= NA
cohort$NOC3_CD11_Top[cohort$TopParent=="Parent1"]=cohort$NOC3_CD11_Parent1[cohort$TopParent=="Parent1"]
cohort$NOC3_CD11_Top[cohort$TopParent=="Parent2"]=cohort$NOC3_CD11_Parent2[cohort$TopParent=="Parent2"]
cohort$NOC3_CD11_Top[cohort$TopParent=="MAIN_PARENT"]=cohort$NOC3_CD11_Main[cohort$TopParent=="MAIN_PARENT"]


cohort$IntendedOccupation_Main[cohort$SKILL_LEVEL_CD11_Main=="0"]="Managerial/Professional"
cohort$IntendedOccupation_Main[cohort$SKILL_LEVEL_CD11_Main=="A"]="Managerial/Professional"
cohort$IntendedOccupation_Main[cohort$SKILL_LEVEL_CD11_Main=="B"]="Skilled and Technical"
cohort$IntendedOccupation_Main[cohort$SKILL_LEVEL_CD11_Main=="C"]="Clerical and Laborers"
cohort$IntendedOccupation_Main[cohort$SKILL_LEVEL_CD11_Main=="D"]="Clerical and Laborers"
cohort$IntendedOccupation_Main[cohort$SKILL_LEVEL_CD11_Main=="N"]="New Workers"
cohort$IntendedOccupation_Main[cohort$SKILL_LEVEL_CD11_Main %in% c("O","R","S")]="Non-Workers"

cohort$IntendedOccupation_Father[cohort$SKILL_LEVEL_CD11_Father=="0"]="Managerial/Professional"
cohort$IntendedOccupation_Father[cohort$SKILL_LEVEL_CD11_Father=="A"]="Managerial/Professional"
cohort$IntendedOccupation_Father[cohort$SKILL_LEVEL_CD11_Father=="B"]="Skilled and Technical"
cohort$IntendedOccupation_Father[cohort$SKILL_LEVEL_CD11_Father=="C"]="Clerical and Laborers"
cohort$IntendedOccupation_Father[cohort$SKILL_LEVEL_CD11_Father=="D"]="Clerical and Laborers"
cohort$IntendedOccupation_Father[cohort$SKILL_LEVEL_CD11_Father=="N"]="New Workers"
cohort$IntendedOccupation_Father[cohort$SKILL_LEVEL_CD11_Father %in% c("O","R","S")]="Non-Workers"

cohort$IntendedOccupation_Top[cohort$SKILL_LEVEL_CD11_Top=="0"]="Managerial/Professional"
cohort$IntendedOccupation_Top[cohort$SKILL_LEVEL_CD11_Top=="A"]="Managerial/Professional"
cohort$IntendedOccupation_Top[cohort$SKILL_LEVEL_CD11_Top=="B"]="Skilled and Technical"
cohort$IntendedOccupation_Top[cohort$SKILL_LEVEL_CD11_Top=="C"]="Clerical and Laborers"
cohort$IntendedOccupation_Top[cohort$SKILL_LEVEL_CD11_Top=="D"]="Clerical and Laborers"
cohort$IntendedOccupation_Top[cohort$SKILL_LEVEL_CD11_Top=="N"]="New Workers"
cohort$IntendedOccupation_Top[cohort$SKILL_LEVEL_CD11_Top %in% c("O","R","S")]="Non-Workers"





################################################### clean
library(dplyr)
################## country where CHILD was born in:
cohort=merge(x=cohort, y=landing[,c("IMDB_ID","COUNTRY_BIRTH")], by="IMDB_ID")
cohort$birthcountry_child=cohort$COUNTRY_BIRTH; cohort$COUNTRY_BIRTH=NULL

# imposes common support 
cohort$refugeein=ifelse(cohort$ImmigrationCategory=="Refugee",1,0)
countrycount=cohort %>% group_by(birthcountry_child,refugeein) %>% summarize(count=n()) %>% ungroup()

# keep with groups of more than 30 people
countrycountsum=countrycount %>% filter (count>30) %>% group_by(birthcountry_child) %>% summarize(count=n())



#indicator if more than 30 people 
countrykeep=countrycountsum$birthcountry_child[countrycountsum$count>1]

cohort$birthcountrydrop=ifelse(cohort$birthcountry_child %in% countrykeep,0,1)


########### cma tab 
# CMA tabulations
dfcmatab=cohort %>% group_by(DESTINATION_CMA, refugeein) %>% summarize(count=n()) %>% ungroup()

cmacountsum= dfcmatab %>% filter(count>30) %>% group_by(DESTINATION_CMA) %>% summarize(count=n())
keepcma=cmacountsum$DESTINATION_CMA[cmacountsum$count>1]

cohort$cmadrop=ifelse(cohort$DESTINATION_CMA %in% keepcma,0,1)

# Refugee type 
# get list of imdb_ids that came under private sponsor refugee
privaterefugee_ids=landing$IMDB_ID[landing$IMMIGRATION_CATEGORY_CENSUS %in% c("C3220")]
blendedrefugee_ids=landing$IMDB_ID[landing$IMMIGRATION_CATEGORY_CENSUS %in% c("C3230")]


cohort$refugeetype=NA
cohort$refugeetype[cohort$IMDB_ID %in% privaterefugee_ids]="Private"
cohort$refugeetype[cohort$IMDB_ID %in% blendedrefugee_ids]="Blended"
cohort$refugeetype[cohort$ImmigrationCategory=="Refugee" & is.na(cohort$refugeetype)]="Public"

# destination province: manage qc
cohort=merge(cohort,landing[,c("IMDB_ID","DESTINATION_PROVINCE")], by="IMDB_ID")

fwrite(cohort,"H:/Zheng_10223/Joint/cohort2025_prenrf.csv")

############### Merge in NRF 
nrf=read_dta("G:/IMDB_AllYears/rdc/IMDB_BDIM_2022_v1/data_donnees/stata/Core_IMDB/nrf_permit_1980_2022_f3_v1.dta")

# get first permit: OF PARENT
nrfmin=nrf %>% group_by(IMDB_ID) %>% mutate(minpermitdate=min(EFFECTIVE_DATE)) %>% ungroup()

nrfmin=nrfmin[nrfmin$EFFECTIVE_DATE==nrfmin$minpermitdate,]

nrfmin$minlandingyear=substr(nrfmin$EFFECTIVE_DATE,1,4)

nrfmin=nrfmin[,c("IMDB_ID","minlandingyear","EFFECTIVE_DATE","DOCUMENT_TYPE","VALID_DATE","DESTINATION_PROVINCE")]


# some people come with work and study permit at same time: eliminate study permit

nrfmin=nrfmin %>% group_by(IMDB_ID) %>% mutate(countimdb=n())%>% ungroup()

nrfmin=nrfmin %>% filter(!(countimdb>1 & DOCUMENT_TYPE==43))
colnames(nrfmin)=paste(colnames(nrfmin),"_firstland",sep="")

nrfmin$MAIN_PARENT=nrfmin$IMDB_ID_firstland; nrfmin$IMDB_ID_firstland=NULL

#  merge based on main parent 
cohort=merge(cohort,nrfmin,by="MAIN_PARENT",all.x=TRUE)

# merge landing year based on child
nrfmin$landingyear_child_min=nrfmin$minlandingyear_firstland
nrfmin$IMDB_ID=nrfmin$MAIN_PARENT
cohort=merge(cohort,nrfmin[,c("IMDB_ID","landingyear_child_min")],by="IMDB_ID", all.x=TRUE)

cohort$HasFather <- (cohort$gender_Parent1==1 | cohort$gender_Parent2==1)
cohort$HasMother <- (cohort$gender_Parent1==2 | cohort$gender_Parent2==2)
cohort$LandingAge_Mother <- (cohort$gender_Parent1==2)*(cohort$Landing_Year1-cohort$BirthYear_Parent1)+(cohort$gender_Parent2==2)*(cohort$Landing_Year2-cohort$BirthYear_Parent2)
cohort$LandingAge_Father <- (cohort$gender_Parent1==1)*(cohort$Landing_Year1-cohort$BirthYear_Parent1)+(cohort$gender_Parent2==1)*(cohort$Landing_Year2-cohort$BirthYear_Parent2)


cohort$LandingYear_MainParent_min=cohort$minlandingyear_firstland; cohort$minlandingyear_firstland=NULL

cohort$Landing_Age_MainParent_min=as.numeric(cohort$LandingYear_MainParent_min)-cohort$BirthYear_MainParent


cohort$landingage_childmin=as.numeric(cohort$landingyear_child_min)-cohort$BirthYear_Child

# replace landing age child and landing age main parent with min year if available

# child:
# replace child landing age if min age not missing and less
child_agefill=which(!is.na(cohort$landingage_childmin) & cohort$landingage_childmin>=0 & cohort$landingage_childmin<cohort$LANDING_AGE)
cohort$LANDING_AGE[child_agefill]=cohort$landingage_childmin[child_agefill]

# main parent:
cohort$LandingageMain=cohort$LandingYear_MainParent-cohort$BirthYear_MainParent
parent_agefill=which(cohort$Landing_Age_MainParent_min<cohort$LandingageMain)
cohort$LandingageMain[parent_agefill]=cohort$Landing_Age_MainParent_min[parent_agefill]


# clean up:
cohort[,c("Landing_Age_MainParent_min","landingage_childmin","VALID_DATE_firstland","DESTINATION_PROVINCE_firstland")]=NULL


cohort$married=ifelse(cohort$MainParent_Income_HH_MainParentAge45_49==cohort$MainParent_Income_IND_MainParentAge45_49,0,1)




# merge in the enclave information from Enclaves/Step3_immenclaves_Harrison.R

enclaves=fread("H:/Zheng_10223/Joint/dftractenclave.csv")


mean(enclaves$ratio_white, na.rm=TRUE)

cohort=merge(cohort,enclaves,by.x="MAIN_PARENT",by.y="IMDB_ID",all.x=TRUE)

cohort$visminshare=1-cohort$ratio_white
cohort$enclave=0
cohort$enclave[cohort$visminshare>0.26]=1
# AVERAGE RATIO_WHITE

# fix min landing year
cohort$minlandingyear_main=cohort$EFFECTIVE_DATE_firstland


fwrite(cohort, "H:/Zheng_10223/Joint/cohort_2025.csv")


############################ 




















